| 1 | /* |
| 2 | --- Projectname: Encoder/Decoder (UTF-8 Unicode4.0) FILE Functions --- |
| 3 | (Targetsystem: Crossplatform) |
| 4 | |
| 5 | Author: Dennis Busch (http://www.dennisbusch.de) |
| 6 | |
| 7 | Content: |
| 8 | Functions for storing and restoring full lines of |
| 9 | UTF-8(as described in "Unicode4.0.0 Chapter 3.9") encoded UNICODE |
| 10 | 'string's , 'wstring's and 'uistring's to and from 'FILE's. |
| 11 | |
| 12 | Details: |
| 13 | Unicode characters that cannot be held in a single |
| 14 | 'memory cell' of the desired target string type will be read but ignored. |
| 15 | |
| 16 | Footnote1: |
| 17 | "The definition of UTF-8 prohibits encoding character numbers between |
| 18 | U+D800 and U+DFFF" |
| 19 | So characters decoded into that range will be ignored as well and, more |
| 20 | importantly, characters from that range will not be encoded on storing. |
| 21 | |
| 22 | Footnote2: |
| 23 | On storing, the ByteOrderMark(BOM) U+FEFF |
| 24 | will be stored like any other character encoded in UTF-8, |
| 25 | always leading to the octet sequence EF BB BF. |
| 26 | On restoring, any sequence decoded to U+FEFF will be ignored, that is, |
| 27 | it is not appended to the resulting string. |
| 28 | |
| 29 | Footnote3: |
| 30 | The standard allows only Unicode code points of at most 21 |
| 31 | bits to be encoded or decoded. Even though characters |
| 32 | with the full range of 32bits *could* be stored in UTF-8 |
| 33 | using up to 6 octets, this implementation does *NOT* do so. |
| 34 | It only encodes and decodes sequences of up to 4 octets, as defined by said |
| 35 | standard("Unicode4.0.0 Chapter 3.9" UTF-8). |
| 36 | Quote: "Unicode4.0.0 Appendix C.3" |
| 37 | "those five- and six-byte sequences are illegal for the use |
| 38 | of UTF-8 as an encoding form of Unicode characters." |
| 39 | |
| 40 | Footnote4: |
| 41 | This implementation also tries to protect against decoding of invalid |
| 42 | octet sequences. |
| 43 | It does so by silently reading past any invalid sequence: every such |
| 44 | invalid sequence (which should not be allowed in the first place) will |
| 45 | be skipped and an ASCII '?' is given out as the result character in its place. |
| 46 | */ |
| 47 | |
| 48 | #include <cstdio> |
| 49 | #include <cstdlib> |
| 50 | #include <string> |
| 51 | using namespace std; |
| 52 | |
| 53 | #if !defined(__DB_utf_eight_HEAD_INCLUDED) |
| 54 | #define __DB_utf_eight_HEAD_INCLUDED |
| 55 | |
| 56 | /* - All functions return a negative value on failure or if the result |
| 57 | would be empty. |
| 58 | - The FILE parameter is always expected to be a valid |
| 59 | file pointer and it must be opened in binary mode. */ |
| 60 | |
| 61 | |
| 62 | /* Writes the UTF-8 encoded byte order mark U+FEFF (octets EF BB BF) to out |
| 63 | as a signature for external editors; use it only once, at the very beginning of the file. The FILE must be valid and opened in binary mode. Returns a negative value on failure. */ |
| 64 | int write_bom_utf8(FILE *out); |
| 65 | |
| 66 | // encode_utf8 writes a single Unicode character (code) to out as UTF-8; decode_utf8 reads a single UTF-8 encoded character from in and hands it back through result |
| 67 | int encode_utf8(FILE *out, unsigned int code); |
| 68 | int decode_utf8(FILE *in, unsigned int *result); |
| 69 | |
| 70 | // Reading one line from file into a string or wstring |
| 71 | /* Skips any leading newline codes, carriage return codes and byte order |
| 72 | marks that precede the actual character data, |
| 73 | then reads characters until the next newline or carriage return appears; |
| 74 | that terminating nl or cr character is not appended to result. */ |
| 75 | /* In practice this means two things: empty lines are skipped, and a Windows |
| 76 | CR+NL pair does not lead to reading an extra empty line. */ |
| 77 | /* If the return value is 0, result is replaced with the line that was read; |
| 78 | otherwise result is not altered in any way. Characters that do not fit into a single element of the target string type are read but ignored. */ |
| 79 | int readline_utf8(FILE *in, string *result); |
| 80 | int readline_utf8(FILE *in, wstring *result); |
| 81 | |
| 82 | // Writing a string or wstring to file as UTF-8 (also writes CR+NL after the string) |
| 83 | /* Note: if the FILE is not in binary mode, some platforms (like Windows) |
| 84 | convert NL to CR+NL on output, so CR+CR+NL actually ends up in the file. |
| 85 | That is not a problem for the readline functions above; the information |
| 86 | is only given here so that the extra CR does not cause confusion if you |
| 87 | ever take a closer look at the output file. */ |
| 88 | int writeline_utf8(FILE *out, string line); |
| 89 | int writeline_utf8(FILE *out, wstring line); |
| 90 | |
| 91 | // Functions for strings that can store the full range of characters (uistring uses one unsigned int per character). NOTE(review): the C++ standard does not require a char_traits specialization for unsigned int - confirm the target toolchains provide one |
| 92 | typedef basic_string<unsigned int> uistring; |
| 93 | int readline_utf8(FILE *in, uistring *result); |
| 94 | int writeline_utf8(FILE *out, uistring line); |
| 95 | |
| 96 | #endif // #if !defined(__DB_utf_eight_HEAD_INCLUDED) |